import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = (12.0, 10.0)
import seaborn as sns
%matplotlib inline
from pandas.plotting import scatter_matrix
from scipy import stats
from scipy import stats, special
from sklearn import model_selection, metrics, linear_model, datasets, feature_selection
from seaborn import pairplot, heatmap
from sklearn.model_selection import train_test_split
# Importing All needed Libraries
vs = pd.read_excel('VehiclesSalesData.xls')
# Vehicles Sales is vs now to be easily recalled.
ev = pd.read_csv('2012-2020_Battery_Electric_Vehicles.csv')
# Battery Electric Vehicles is ev now to be easily recalled.
phev = pd.read_csv('2012-2020_Plug-in_Hybrid_Electric_Vehicles.csv')
# Plug in Hybrid Electric Vehicles is phev now to be easily recalled.
print("Vehicles Sales rows and columns:")
print(vs.shape)
print("Battery Electric Vehicles rows and columns:")
print(ev.shape)
print("Plug in Hybrid Electric Vehicles rows and columns:")
print(phev.shape)
# Number of rows and columns.
print("Vehicles Sales First & Last 5 rows:")
print(vs.head)
print("Battery Electric Vehicles First & Last 5 rows:")
print(ev.head)
print("Plug in Hybrid Electric Vehicles First & Last 5 rows:")
print(phev.head)
# First and last 5 rows for each data.
vs.describe(include='all')
# View statistical details.
ev.describe(include='all')
# View statistical details.
phev.describe(include='all')
# View statistical details.
vs.isnull().values.any() # Any null values?
ev.isnull().values.any() # Any null values?
phev.isnull().values.any() # Any null values?
vs.isnull().sum() # Where are the null values?
ev.isnull().sum() # Where are the null values?
phev.isnull().sum() # Where are the null values?
vs.duplicated().any() # Any duplicated values?
ev.duplicated().any() # Any duplicated values?
phev.duplicated().any() # Any duplicated values?
vs.info()
# Information about the data including the index, data type in columns.
ev.info()
# Information about the data including the index, data type in columns.
phev.info()
# Information about the data including the index, data type in columns.
vs.describe() # Looking for outliers.
ev.describe() # Looking for outliers.
phev.describe() # Looking for outliers.
vs
vs.fillna(0, inplace=True)
# Replace all blank spaces with zero or dropping the rows(to make visualization simple).
# Because replacing blank spaces with the mean value will make the sales count inaccurate.
vs.drop([
'DGUID', 'Statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',
'VECTOR', 'COORDINATE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'
],
axis=1,
inplace=True)
# Drop most of the columns as there is no difference or effect on our data.
# Keeping only the number of units sold per vehicle and fuel type in certain locations from 2011 to 2019.
vs.columns # View all Columns names before renaming.
vs.columns = [
'Date', 'Location', 'Fuel_Type', 'Vehicle_type', 'Number_of_Cars_sold'
]
# Changing the columns names to be more clear.
vs = vs[(vs['Fuel_Type'] != 'Diesel') & (vs['Fuel_Type'] != 'All fuel types')]
vs = vs[(vs['Fuel_Type'] != 'Other fuel types')
& (vs['Fuel_Type'] != 'Gasoline')]
# Drop some rows with inefficient fuel types(gasoline and diesel) and other fuel types(liquid propane).
vs # Let's have a look....is it readable now?
ev = pd.read_csv('2012-2020_Battery_Electric_Vehicles.csv', header=[0, 1])
ev.columns = ev.columns.map('_'.join)
# Combining the headers with the first row.
ev.replace(NaN, 0) # Replace all NAN with 0.
ev.columns # View all Columns names before renaming.
ev.columns = [
'Year', 'Make', 'Model', 'Vehicle Class', 'Motor(kW)', 'Transmission',
'Fuel Type', 'City(kWh/100 km)', 'Highway(kWh/100 km)',
'City+Highway(kWh/100 km)', 'City(Litre/100 km)', 'Highway(Litre/100 km)',
'City+Highway(Litre/100 km)', 'Range(km)', 'CO2 Emissions(g/km)',
'CO2 Rating', 'Smog Rating', 'Recharge_Time(hours)'
]
# Changing the columns names to be more clear.
ev.drop([
'Transmission', 'Fuel Type', 'City(Litre/100 km)', 'Highway(Litre/100 km)',
'City+Highway(Litre/100 km)', 'CO2 Emissions(g/km)', 'CO2 Rating',
'Smog Rating'
],
axis=1,
inplace=True)
# Drop Transmission, Fuel Type(All cars are electirc), Anything in Litre, all emissions as there is no difference or effect on our data.
ev # Let's have a look....is it readable now?
phev
phev = pd.read_csv('2012-2020_Plug-in_Hybrid_Electric_Vehicles.csv',
header=[0, 1])
phev.columns = phev.columns.map('_'.join)
# Combining the headers with the first row.
phev.columns = [
'Year', 'Make', 'Model', 'Vehicle Class', 'Motor(kW)',
'Engine Size(Litre)', 'Cylinders', 'Transmission',
'Fuel Type(Electricity/Gasoline)', 'Electricity/Gasoline(100km)',
'Range(km)', 'Recharge Time(hours)', 'Fuel Type 2(Gasoline only)',
'City(Litre/100km)', 'Highway(Litre/100km)', 'Highway&City(Litre/100km)',
'Combined_Range(km)', 'CO2 Emissions(g/km)', 'CO2 Rating', 'Smog Rating'
]
# Changing the columns names to be more clear.
phev.drop(['Transmission'], axis=1, inplace=True)
# Drop Transmission as there is no difference or effect on our data.
phev['CO2 Rating'].fillna(0, inplace=True)
# Replace all NAN with 0.
phev['Smog Rating'].fillna(0, inplace=True)
# Replace all NAN with 0.
phev # Let's have a look....is it readable now?
vs.to_csv('vs.csv')
ev.to_csv('ev.csv')
phev.to_csv('phev.csv')
# Saving the cleaned data for visulaization.
vsp = pandas_profiling.ProfileReport(vs)
# Pandas Profiling for Vehicle Sales
vsp.to_notebook_iframe()
# Pandas profiling interactive animation on the notebook(Html viewing)
sns.heatmap(vs.corr())
plt.title('Vehicle Sales');
# Visulazing the correlation on a heatmap.
sns.barplot(x="Number_of_Cars_sold", y="Location", hue="Fuel_Type", data=vs)
plt.xlabel('Units Sold')
plt.ylabel('Province')
plt.title('Number of cars sold in each location across Canada');
# Plotting using the barplot (How many car sold in each province?)
sns.pointplot(x="Number_of_Cars_sold", y="Location", hue="Fuel_Type", data=vs)
plt.xlabel('Units Sold')
plt.ylabel('Province')
plt.title('Number of cars sold in each location across Canada');
#Plotting on a pointplot (How many car sold in each province?)
plt.scatter(x=vs['Number_of_Cars_sold'],\
y=vs['Fuel_Type'],\
alpha=0.5,\
s=50,\
color='g',\
marker='x')
plt.xlabel('Units Sold')
plt.ylabel('Fuel Type')
plt.title('Number of cars sold per fuel type');
# Matplotlib (How many cars sold for each fuel type?)
# The whole data on a scatter matrix.
scatter_matrix(vs, alpha=0.5, figsize=(12, 12), diagonal='kde', s=100)
plt.title('Number of cars sold per year');
# Scatter Matrix (How many cars sold per calendar year?)
sns.factorplot(x="Number_of_Cars_sold",
y="Vehicle_type",
hue="Fuel_Type",
data=vs,
col="Date",
kind="box",
height=3.5,
aspect=1.5)
plt.title('Cars sold by vehicle and fuel type over the years');
# Plotting using factorplot with specific size and aspect.(Number of cars sold by vehicle type per calendar year)
g = sns.PairGrid(vs,
x_vars=["Number_of_Cars_sold", "Date"],
y_vars=["Fuel_Type", "Location"],
aspect=1.75,
height=2.25)
g.map(sns.violinplot, palette="Blues_r")
plt.title('Cars sold over the years for different fuel type among Canada');
# Showing categorical relationship on a pair grid & a violinplot.
pev = pandas_profiling.ProfileReport(ev)
# Pandas Profiling for Electric Vehicle.
pev.to_notebook_iframe()
# Pandas profiling interactive animation on the notebook(Html viewing)
sns.heatmap(ev.corr())
plt.title('Electric vehicle')
# Visulazing the correlation on a heatmap.
plt.scatter(x=ev['Range(km)'],\
y=ev['Recharge_Time(hours)'],\
alpha=0.5,\
s=50,\
color='b',\
marker='x')
plt.xlabel('Range per Km')
plt.ylabel('Charging Time hourly')
plt.title('Range by Km for hourly charging')
# Matplotlib (How long can the car go with one charge?)
sns.factorplot(x="Recharge_Time(hours)",
y="Range(km)",
data=ev,
col="Make",
kind="box",
height=3.5,
aspect=1.5)
plt.title('Range by Km for hourly charging for each car make');
# Plotting using factorplot with specific size and aspect.(How long can each model go with one charge?)
sns.heatmap(phev.corr())
plt.title('Plug-in Hybrid Electric Vehicle')
# Visulazing the correlation on a heatmap.
sns.barplot(x="Combined_Range(km)", y="Make", data=phev)
plt.xlabel('Range by km for hybrid vehicles')
plt.ylabel('Make of the car')
plt.title('Range by make of Hybrid cars');
# Plotting using the barplot (Which car can go longer?)
train1, test1 = model_selection.train_test_split(vs)
train2, test2 = model_selection.train_test_split(ev)
train3, test3 = model_selection.train_test_split(phev)
len(test1) / (len(train1) + len(test1))
len(test2) / (len(train2) + len(test2))
len(test3) / (len(train3) + len(test3))
vs = vs.sample(frac=1)
ev = ev.sample(frac=1)
phev = phev.sample(frac=1)
train1
test1
train2
test2
train3
test3
slope, intercept, r_value, p_value, slope_std_error = stats.linregress(
train1.index, train1['Number_of_Cars_sold'])
train_regr = intercept + slope * train1.index
scatter(train1.index, train1['Number_of_Cars_sold'])
plt.plot(train1.index, train_regr, color='red')
slope, intercept, r_value, p_value, slope_std_error = stats.linregress(
train2.index, train2['Range(km)'])
train_regr = intercept + slope * train2.index
scatter(train2.index, train2['Range(km)'])
plt.plot(train2.index, train_regr, color='red')
slope, intercept, r_value, p_value, slope_std_error = stats.linregress(
train3.index, train3['Combined_Range(km)'])
train_regr = intercept + slope * train3.index
scatter(train3.index, train3['Combined_Range(km)'])
plt.plot(train3.index, train_regr, color='red')
!jupyter nbconvert EV.ipynb --to 'html'